Skip to main content

pdf_syntax/object/
stream.rs

1//! Streams.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::Filter;
5use crate::object;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::dict::keys::{DECODE_PARMS, DP, F, FILTER, LENGTH, TYPE};
9use crate::object::{Array, ObjectIdentifier};
10use crate::object::{Object, ObjectLike};
11use crate::reader::Reader;
12use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
13use crate::sync::Arc;
14use crate::util::OptionLog;
15use alloc::borrow::Cow;
16use alloc::vec::Vec;
17use core::fmt::{Debug, Formatter};
18use log::warn;
19use smallvec::SmallVec;
20
21#[derive(Clone)]
22struct StreamInner<'a> {
23    dict: Dict<'a>,
24    filters: SmallVec<[Filter; 2]>,
25    filter_params: SmallVec<[Dict<'a>; 2]>,
26    data: &'a [u8],
27    /// Maximum decoded size enforced in `decoded()` / `decoded_image()`.
28    /// Comes from [`PdfLoadLimits::stream_byte_limit`] at parse time.
29    /// `u64::MAX` means no limit (same sentinel as [`PdfLoadLimits::max_stream_bytes`]).
30    stream_byte_limit: u64,
31}
32
33/// A stream of arbitrary data.
34#[derive(Clone)]
35pub struct Stream<'a>(Arc<StreamInner<'a>>);
36
37impl PartialEq for Stream<'_> {
38    fn eq(&self, other: &Self) -> bool {
39        self.0.dict == other.0.dict && self.0.data == other.0.data
40    }
41}
42
/// Additional parameters for decoding images.
///
/// The `Default` value (all flags off, all options `None`, zero dimensions)
/// is what `Stream::decoded` passes when decoding non-image data.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ImageDecodeParams {
    /// Whether the color space of the image is an indexed color space.
    pub is_indexed: bool,
    /// The bits per component of the image, if that information is available.
    pub bpc: Option<u8>,
    /// The number of components of the image, if that information is available.
    pub num_components: Option<u8>,
    /// A target resolution for the image. Note that this is only a hint so that
    /// in case it's possible, a version of the image will be extracted that
    /// is as close as possible to the hinted dimension.
    pub target_dimension: Option<(u32, u32)>,
    /// The width of the image as indicated by the image dictionary.
    ///
    /// A value of `0` means the width is not known.
    pub width: u32,
    /// The height of the image as indicated by the image dictionary.
    ///
    /// A value of `0` means the height is not known.
    pub height: u32,
}
61
62impl<'a> Stream<'a> {
63    pub(crate) fn new(data: &'a [u8], dict: Dict<'a>, stream_byte_limit: u64) -> Self {
64        let mut collected_filters = SmallVec::new();
65        let mut collected_params = SmallVec::new();
66
67        if let Some(filter) = dict
68            .get::<Name>(F)
69            .or_else(|| dict.get::<Name>(FILTER))
70            .and_then(Filter::from_name)
71        {
72            let params = dict
73                .get::<Dict<'_>>(DP)
74                .or_else(|| dict.get::<Dict<'_>>(DECODE_PARMS))
75                .unwrap_or_default();
76
77            collected_filters.push(filter);
78            collected_params.push(params);
79        } else if let Some(filters) = dict
80            .get::<Array<'_>>(F)
81            .or_else(|| dict.get::<Array<'_>>(FILTER))
82        {
83            let filters = filters.iter::<Name>().map(Filter::from_name);
84            let mut params = dict
85                .get::<Array<'_>>(DP)
86                .or_else(|| dict.get::<Array<'_>>(DECODE_PARMS))
87                .map(|a| a.iter::<Object<'_>>());
88
89            for filter in filters {
90                let params = params
91                    .as_mut()
92                    .and_then(|p| p.next())
93                    .and_then(|p| p.into_dict())
94                    .unwrap_or_default();
95
96                if let Some(filter) = filter {
97                    collected_filters.push(filter);
98                    collected_params.push(params);
99                }
100            }
101        }
102
103        Self(Arc::new(StreamInner {
104            dict,
105            filters: collected_filters,
106            filter_params: collected_params,
107            data,
108            stream_byte_limit,
109        }))
110    }
111
112    /// Return the raw, decrypted data of the stream.
113    ///
114    /// Stream filters will not be applied.
115    pub fn raw_data(&self) -> Cow<'a, [u8]> {
116        let ctx = self.0.dict.ctx();
117
118        if ctx.xref().needs_decryption(ctx)
119            && self
120                .0
121                .dict
122                .get::<object::String>(TYPE)
123                .map(|t| t.as_ref() != b"XRef")
124                .unwrap_or(true)
125        {
126            // Streams are always indirect objects and therefore always have an obj_id.
127            // If somehow absent (corrupt PDF), fall back to raw data.
128            if let Some(obj_id) = self.0.dict.obj_id() {
129                Cow::Owned(
130                    ctx.xref()
131                        .decrypt(obj_id, self.0.data, DecryptionTarget::Stream)
132                        .unwrap_or_default(),
133                )
134            } else {
135                Cow::Borrowed(self.0.data)
136            }
137        } else {
138            Cow::Borrowed(self.0.data)
139        }
140    }
141
142    /// Return the raw, underlying dictionary of the stream.
143    pub fn dict(&self) -> &Dict<'a> {
144        &self.0.dict
145    }
146
147    /// Return the object identifier of the stream, if available.
148    ///
149    /// Returns `None` if the stream is corrupt and lacks an object ID.
150    pub fn obj_id(&self) -> Option<ObjectIdentifier> {
151        self.0.dict.obj_id()
152    }
153
154    /// Return the filters that are applied to the stream.
155    pub fn filters(&self) -> &[Filter] {
156        &self.0.filters
157    }
158
159    /// Return the decoded data of the stream.
160    ///
161    /// Note that the result of this method will not be cached, so calling it multiple
162    /// times is expensive.
163    pub fn decoded(&self) -> Result<Vec<u8>, DecodeFailure> {
164        self.decoded_image(&ImageDecodeParams::default())
165            .map(|r| r.data)
166    }
167
168    /// Return the decoded data of the stream, and return image metadata
169    /// if available.
170    pub fn decoded_image(
171        &self,
172        image_params: &ImageDecodeParams,
173    ) -> Result<FilterResult, DecodeFailure> {
174        if let Some(limit) = self.0.dict.ctx().load_limits().image_pixel_limit()
175            && image_params.width > 0
176            && image_params.height > 0
177        {
178            let pixels =
179                u64::from(image_params.width).saturating_mul(u64::from(image_params.height));
180            if pixels > u64::from(limit) {
181                warn!("image pixel count {pixels} exceeds limit {limit}, stopping image decode");
182                return Err(DecodeFailure::ImageDecode);
183            }
184        }
185
186        let data = self.raw_data();
187
188        let mut current: Option<FilterResult> = None;
189
190        for (filter, params) in self.0.filters.iter().zip(self.0.filter_params.iter()) {
191            let new = filter.apply(
192                current.as_ref().map(|c| c.data.as_ref()).unwrap_or(&data),
193                params.clone(),
194                image_params,
195            )?;
196            current = Some(new);
197        }
198
199        let result = current.unwrap_or(FilterResult {
200            data: data.to_vec(),
201            image_data: None,
202        });
203
204        let limit = self.0.stream_byte_limit;
205        if limit != u64::MAX {
206            let observed = result.data.len() as u64;
207            if observed > limit {
208                warn!("decoded stream size {observed} exceeds limit {limit}, stopping decode");
209                return Err(DecodeFailure::StreamTooLarge { observed, limit });
210            }
211        }
212
213        Ok(result)
214    }
215}
216
217impl Debug for Stream<'_> {
218    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
219        write!(f, "Stream (len: {:?})", self.0.data.len())
220    }
221}
222
223impl Skippable for Stream<'_> {
224    fn skip(_: &mut Reader<'_>, _: bool) -> Option<()> {
225        // A stream can never appear in a dict/array, so it should never be skipped.
226        warn!("attempted to skip a stream object");
227
228        None
229    }
230}
231
232impl<'a> Readable<'a> for Stream<'a> {
233    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
234        let dict = r.read_with_context::<Dict<'_>>(ctx)?;
235
236        if dict.contains_key(F) {
237            warn!("encountered stream referencing external file, which is unsupported");
238
239            return None;
240        }
241
242        let stream_byte_limit = ctx.load_limits().stream_byte_limit().unwrap_or(u64::MAX);
243        let offset = r.offset();
244        parse_proper(r, &dict, stream_byte_limit)
245            .or_else(|| {
246                warn!("failed to parse stream, trying to parse it manually");
247
248                r.jump(offset);
249                parse_fallback(r, &dict, stream_byte_limit)
250            })
251            .error_none("was unable to manually parse the stream")
252    }
253}
254
/// A failure that can occur during decoding a data stream.
///
/// The type is `Copy` and comparable, so callers and tests can match on it
/// or compare it with `==` directly.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum DecodeFailure {
    /// An image stream failed to decode.
    ImageDecode,
    /// A data stream failed to decode.
    StreamDecode,
    /// A failure occurred while decrypting a file.
    Decryption,
    /// An unknown failure occurred.
    Unknown,
    /// The decoded stream exceeds the configured
    /// [`PdfLoadLimits::max_stream_bytes`](crate::pdf::PdfLoadLimits) limit.
    ///
    /// Unlike the other variants, this is a **hard stop** — callers must not
    /// silently discard it with `.ok()`. Propagate it as
    /// `LimitError::StreamTooLarge` / `Error::ResourceLimitExceeded`.
    StreamTooLarge {
        /// Decoded byte count that triggered the limit.
        observed: u64,
        /// The configured limit.
        limit: u64,
    },
}
279
/// An image color space.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ImageColorSpace {
    /// Grayscale color space.
    Gray,
    /// RGB color space.
    Rgb,
    /// RGB produced by JPEG YCbCr→RGB decoding.
    ///
    /// JPEG images stored with YCbCr encoding (Adobe APP14 transform=1 or
    /// JFIF default) are converted to sRGB by the JPEG decoder using the
    /// standard BT.601 matrix. The resulting RGB values are already in sRGB
    /// colorimetry. Any PDF `/ColorSpace` entry that is not device-RGB (e.g.
    /// an `ICCBased` printer profile) should be ignored for these images;
    /// the JPEG decoder's own colour model takes precedence, matching MuPDF
    /// and Acrobat behaviour.
    RgbFromYCbCr,
    /// CMYK color space.
    Cmyk,
    /// An unknown color space, carrying the raw value that could not be
    /// mapped to one of the other variants.
    Unknown(u8),
}
302
303/// Additional data that is extracted from some image streams.
304pub struct ImageData {
305    /// An optional alpha channel of the image.
306    pub alpha: Option<Vec<u8>>,
307    /// The color space of the image.
308    pub color_space: Option<ImageColorSpace>,
309    /// The bits per component of the image.
310    pub bits_per_component: u8,
311    /// The width of the image.
312    pub width: u32,
313    /// The height of the image.
314    pub height: u32,
315}
316
317/// The result of applying a filter.
318pub struct FilterResult {
319    /// The decoded data.
320    pub data: Vec<u8>,
321    /// Additional data that is extracted from JPX image streams.
322    pub image_data: Option<ImageData>,
323}
324
325impl FilterResult {
326    pub(crate) fn from_data(data: Vec<u8>) -> Self {
327        Self {
328            data,
329            image_data: None,
330        }
331    }
332}
333
334fn parse_proper<'a>(
335    r: &mut Reader<'a>,
336    dict: &Dict<'a>,
337    stream_byte_limit: u64,
338) -> Option<Stream<'a>> {
339    let length = dict.get::<u32>(LENGTH)?;
340
341    r.skip_white_spaces_and_comments();
342    r.forward_tag(b"stream")?;
343    // Skip horizontal whitespace (spaces/tabs) between "stream" keyword and EOL.
344    // Some producers write "stream \r\n" (with a trailing space) which is technically
345    // non-conforming but tolerated by Acrobat and MuPDF.
346    while r.peek_byte().is_some_and(|b| b == b' ' || b == b'\t') {
347        r.forward();
348    }
349    r.forward_tag(b"\n")
350        .or_else(|| r.forward_tag(b"\r\n"))
351        .or_else(|| r.forward_tag(b"\r"))?;
352    let data = r.read_bytes(length as usize)?;
353    r.skip_white_spaces();
354    r.forward_tag(b"endstream")?;
355
356    Some(Stream::new(data, dict.clone(), stream_byte_limit))
357}
358
359fn parse_fallback<'a>(
360    r: &mut Reader<'a>,
361    dict: &Dict<'a>,
362    stream_byte_limit: u64,
363) -> Option<Stream<'a>> {
364    while r.forward_tag(b"stream").is_none() {
365        r.read_byte()?;
366    }
367
368    // Skip any horizontal whitespace between "stream" keyword and EOL (same lenience as
369    // parse_proper — some producers write "stream \r\n").
370    while r.peek_byte().is_some_and(|b| b == b' ' || b == b'\t') {
371        r.forward();
372    }
373    r.forward_tag(b"\n")
374        .or_else(|| r.forward_tag(b"\r\n"))
375        // Technically not allowed, but no reason to not try it.
376        .or_else(|| r.forward_tag(b"\r"))?;
377
378    let data_start = r.tail()?;
379    let start = r.offset();
380
381    loop {
382        if r.peek_byte()?.is_ascii_whitespace() || r.peek_tag(b"endstream").is_some() {
383            let length = r.offset() - start;
384            let data = data_start.get(..length)?;
385
386            r.skip_white_spaces();
387
388            // This was just a whitespace in the data stream but not actually marking the end
389            // of the stream, so continue searching.
390            if r.forward_tag(b"endstream").is_none() {
391                continue;
392            }
393
394            let stream = Stream::new(data, dict.clone(), stream_byte_limit);
395
396            // Seems like we found the end!
397            return Some(stream);
398        } else {
399            r.read_byte()?;
400        }
401    }
402}
403
404impl<'a> TryFrom<Object<'a>> for Stream<'a> {
405    type Error = ();
406
407    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
408        match value {
409            Object::Stream(s) => Ok(s),
410            _ => Err(()),
411        }
412    }
413}
414
415impl<'a> ObjectLike<'a> for Stream<'a> {}
416
#[cfg(test)]
mod tests {
    use super::DecodeFailure;
    use crate::object::Stream;
    use crate::pdf::PdfLoadLimits;
    use crate::reader::Reader;
    use crate::reader::{ReaderContext, ReaderExt};

    /// A well-formed, unfiltered stream object with a ten-byte payload.
    const TEN_BYTE_STREAM: &[u8] = b"<< /Length 10 >> stream\nabcdefghij\nendstream";

    /// Parse `bytes` as a single stream object, panicking if that fails.
    fn parse(bytes: &'static [u8], ctx: &ReaderContext<'static>) -> Stream<'static> {
        let mut reader = Reader::new(bytes);

        reader.read_with_context::<Stream<'_>>(ctx).unwrap()
    }

    /// The payload delimited by `/Length` is captured verbatim.
    #[test]
    fn stream() {
        let parsed = parse(TEN_BYTE_STREAM, &ReaderContext::dummy());

        assert_eq!(parsed.0.data, b"abcdefghij");
    }

    /// `decoded()` on an unfiltered stream succeeds when no byte limit is set.
    #[test]
    fn decoded_no_limit() {
        let parsed = parse(
            b"<< /Length 5 >> stream\nhello\nendstream",
            &ReaderContext::dummy(),
        );

        assert_eq!(parsed.decoded().unwrap(), b"hello");
    }

    /// `decoded()` returns `StreamTooLarge` when the decoded size exceeds the configured limit.
    #[test]
    fn decoded_exceeds_byte_limit() {
        // 10-byte payload, limit of 5 bytes → should fail.
        let ctx = ReaderContext::dummy_with_limits(PdfLoadLimits::new().max_stream_bytes(5));
        let parsed = parse(TEN_BYTE_STREAM, &ctx);

        match parsed.decoded() {
            Err(DecodeFailure::StreamTooLarge { observed, limit }) => {
                assert_eq!(observed, 10);
                assert_eq!(limit, 5);
            }
            other => panic!("expected StreamTooLarge, got {other:?}"),
        }
    }

    /// `decoded()` succeeds when the payload exactly equals the byte limit.
    #[test]
    fn decoded_at_byte_limit_succeeds() {
        let ctx = ReaderContext::dummy_with_limits(PdfLoadLimits::new().max_stream_bytes(10));
        let parsed = parse(TEN_BYTE_STREAM, &ctx);

        assert_eq!(parsed.decoded().unwrap(), b"abcdefghij");
    }
}
481}