Skip to main content

pdf_objects/
stream.rs

1use std::io::{Read, Write};
2
3use flate2::Compression;
4use flate2::read::ZlibDecoder;
5use flate2::write::ZlibEncoder;
6
7use crate::error::{PdfError, PdfResult};
8use crate::types::{PdfStream, PdfValue};
9
10/// Compress `data` with FlateDecode (zlib / deflate) at the default
11/// compression level. Used by the writer when re-emitting rewritten content
12/// streams so the saved PDF does not bloat with plaintext content bytes.
13pub fn flate_encode(data: &[u8]) -> PdfResult<Vec<u8>> {
14    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
15    encoder
16        .write_all(data)
17        .map_err(|error| PdfError::Corrupt(format!("flate encode failed: {error}")))?;
18    encoder
19        .finish()
20        .map_err(|error| PdfError::Corrupt(format!("flate encode finalize failed: {error}")))
21}
22
23pub fn decode_stream(stream: &PdfStream) -> PdfResult<Vec<u8>> {
24    let filter_names = normalize_filter_list(stream.dict.get("Filter"))?;
25    let mut decoded = stream.data.clone();
26    for filter_name in &filter_names {
27        decoded = apply_filter(filter_name, &decoded)?;
28    }
29    apply_predictor(&decoded, stream.dict.get("DecodeParms"))
30}
31
32/// Return the /Filter entry as an ordered list of filter names, whether
33/// the source dictionary uses the single-name shorthand or the array
34/// form. Empty list means no filters applied (raw data).
35fn normalize_filter_list(value: Option<&PdfValue>) -> PdfResult<Vec<String>> {
36    match value {
37        None => Ok(Vec::new()),
38        Some(PdfValue::Null) => Ok(Vec::new()),
39        Some(PdfValue::Name(name)) => Ok(vec![name.clone()]),
40        Some(PdfValue::Array(items)) => {
41            let mut names = Vec::with_capacity(items.len());
42            for item in items {
43                match item {
44                    PdfValue::Name(name) => names.push(name.clone()),
45                    _ => {
46                        return Err(PdfError::Corrupt(
47                            "stream /Filter array contains a non-name entry".to_string(),
48                        ));
49                    }
50                }
51            }
52            Ok(names)
53        }
54        Some(_) => Err(PdfError::Corrupt(
55            "stream /Filter is neither a name nor an array of names".to_string(),
56        )),
57    }
58}
59
60fn apply_filter(filter: &str, data: &[u8]) -> PdfResult<Vec<u8>> {
61    match filter {
62        "FlateDecode" | "Fl" => inflate(data),
63        "ASCII85Decode" | "A85" => ascii85_decode(data),
64        "ASCIIHexDecode" | "AHx" => ascii_hex_decode(data),
65        other => Err(PdfError::Unsupported(format!(
66            "stream filter /{other} is not supported"
67        ))),
68    }
69}
70
71/// Maximum decompressed stream size (256 MiB). Prevents decompression bombs from
72/// exhausting memory in WASM or native contexts.
73const MAX_DECOMPRESSED_SIZE: u64 = 256 * 1024 * 1024;
74
75fn inflate(data: &[u8]) -> PdfResult<Vec<u8>> {
76    let decoder = ZlibDecoder::new(data);
77    let mut output = Vec::new();
78    decoder
79        .take(MAX_DECOMPRESSED_SIZE + 1)
80        .read_to_end(&mut output)
81        .map_err(|error| PdfError::Corrupt(format!("failed to decode flate stream: {error}")))?;
82    if output.len() as u64 > MAX_DECOMPRESSED_SIZE {
83        return Err(PdfError::Corrupt(
84            "decompressed stream exceeds maximum allowed size".to_string(),
85        ));
86    }
87    Ok(output)
88}
89
90/// Decode an ASCII85-encoded byte run (PDF § 7.4.3). Whitespace is
91/// ignored, `z` expands to four zero bytes, and `~>` terminates the
92/// stream; a short final group is padded with `u` and the decoded
93/// tail is truncated accordingly.
94fn ascii85_decode(data: &[u8]) -> PdfResult<Vec<u8>> {
95    let mut output = Vec::with_capacity(data.len());
96    let mut group = [0u8; 5];
97    let mut group_len = 0usize;
98
99    for &byte in data {
100        if byte == b'~' {
101            break; // `~>` EOD marker; the `>` is allowed to follow or be absent.
102        }
103        if matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C) {
104            continue;
105        }
106        if byte == b'z' {
107            if group_len != 0 {
108                return Err(PdfError::Corrupt(
109                    "ASCII85 'z' shortcut inside a partial group".to_string(),
110                ));
111            }
112            output.extend_from_slice(&[0u8; 4]);
113            continue;
114        }
115        if !(b'!'..=b'u').contains(&byte) {
116            return Err(PdfError::Corrupt(format!(
117                "invalid ASCII85 byte 0x{byte:02X}"
118            )));
119        }
120        group[group_len] = byte - b'!';
121        group_len += 1;
122        if group_len == 5 {
123            let value = (group[0] as u64) * 85u64.pow(4)
124                + (group[1] as u64) * 85u64.pow(3)
125                + (group[2] as u64) * 85u64.pow(2)
126                + (group[3] as u64) * 85
127                + (group[4] as u64);
128            if value > u32::MAX as u64 {
129                return Err(PdfError::Corrupt(
130                    "ASCII85 group value exceeds 32 bits".to_string(),
131                ));
132            }
133            output.extend_from_slice(&(value as u32).to_be_bytes());
134            group_len = 0;
135        }
136    }
137
138    if group_len > 0 {
139        if group_len == 1 {
140            return Err(PdfError::Corrupt(
141                "ASCII85 final group contains a single byte".to_string(),
142            ));
143        }
144        // Pad with the max digit so truncating yields the right tail.
145        for entry in group.iter_mut().skip(group_len) {
146            *entry = 84;
147        }
148        let value = (group[0] as u64) * 85u64.pow(4)
149            + (group[1] as u64) * 85u64.pow(3)
150            + (group[2] as u64) * 85u64.pow(2)
151            + (group[3] as u64) * 85
152            + (group[4] as u64);
153        let bytes = (value as u32).to_be_bytes();
154        output.extend_from_slice(&bytes[..group_len - 1]);
155    }
156
157    Ok(output)
158}
159
160/// Decode an ASCIIHex-encoded byte run (PDF § 7.4.2). Whitespace is
161/// ignored, `>` terminates the stream, and a trailing odd nibble is
162/// treated as if followed by `0`.
163fn ascii_hex_decode(data: &[u8]) -> PdfResult<Vec<u8>> {
164    let mut output = Vec::with_capacity(data.len() / 2 + 1);
165    let mut high: Option<u8> = None;
166    for &byte in data {
167        if byte == b'>' {
168            break;
169        }
170        if matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C) {
171            continue;
172        }
173        let nibble = match byte {
174            b'0'..=b'9' => byte - b'0',
175            b'a'..=b'f' => byte - b'a' + 10,
176            b'A'..=b'F' => byte - b'A' + 10,
177            _ => {
178                return Err(PdfError::Corrupt(format!(
179                    "invalid ASCIIHex byte 0x{byte:02X}"
180                )));
181            }
182        };
183        match high.take() {
184            None => high = Some(nibble),
185            Some(h) => output.push((h << 4) | nibble),
186        }
187    }
188    if let Some(h) = high {
189        output.push(h << 4);
190    }
191    Ok(output)
192}
193
194fn apply_predictor(data: &[u8], decode_parms: Option<&PdfValue>) -> PdfResult<Vec<u8>> {
195    let parms = match decode_parms {
196        None => return Ok(data.to_vec()),
197        Some(PdfValue::Dictionary(dict)) => dict,
198        Some(PdfValue::Null) => return Ok(data.to_vec()),
199        Some(PdfValue::Array(_)) => {
200            // Per-filter DecodeParms arrays are legal when multiple filters are
201            // chained. We only support a single FlateDecode filter today so any
202            // array-valued DecodeParms is unexpected.
203            return Err(PdfError::Unsupported(
204                "per-filter DecodeParms arrays are not supported".to_string(),
205            ));
206        }
207        Some(_) => {
208            return Err(PdfError::Corrupt(
209                "DecodeParms is not a dictionary".to_string(),
210            ));
211        }
212    };
213
214    let predictor = parms
215        .get("Predictor")
216        .and_then(PdfValue::as_integer)
217        .unwrap_or(1);
218    match predictor {
219        1 => Ok(data.to_vec()),
220        2 => tiff_predictor_decode(data, parms),
221        10..=15 => png_predictor_decode(data, parms),
222        other => Err(PdfError::Unsupported(format!(
223            "predictor {other} is not supported"
224        ))),
225    }
226}
227
228fn tiff_predictor_decode(data: &[u8], parms: &crate::types::PdfDictionary) -> PdfResult<Vec<u8>> {
229    let columns = parms
230        .get("Columns")
231        .and_then(PdfValue::as_integer)
232        .unwrap_or(1) as usize;
233    let colors = parms
234        .get("Colors")
235        .and_then(PdfValue::as_integer)
236        .unwrap_or(1) as usize;
237    let bits_per_component = parms
238        .get("BitsPerComponent")
239        .and_then(PdfValue::as_integer)
240        .unwrap_or(8) as usize;
241
242    if bits_per_component != 8 {
243        return Err(PdfError::Unsupported(format!(
244            "TIFF predictor with BitsPerComponent {bits_per_component} is not supported"
245        )));
246    }
247    if columns == 0 || colors == 0 {
248        return Err(PdfError::Corrupt(
249            "TIFF predictor Columns/Colors must be positive".to_string(),
250        ));
251    }
252    let row_stride = columns * colors;
253    if data.len() % row_stride != 0 {
254        return Err(PdfError::Corrupt(format!(
255            "TIFF predictor row length mismatch: data={} stride={row_stride}",
256            data.len()
257        )));
258    }
259    let mut output = Vec::with_capacity(data.len());
260    for row in data.chunks_exact(row_stride) {
261        for (component_index, byte) in row.iter().enumerate() {
262            if component_index < colors {
263                // First pixel in a row is stored as-is per component.
264                output.push(*byte);
265            } else {
266                let previous = output[output.len() - colors];
267                output.push(previous.wrapping_add(*byte));
268            }
269        }
270    }
271    Ok(output)
272}
273
274fn png_predictor_decode(data: &[u8], parms: &crate::types::PdfDictionary) -> PdfResult<Vec<u8>> {
275    let columns = parms
276        .get("Columns")
277        .and_then(PdfValue::as_integer)
278        .unwrap_or(1) as usize;
279    let colors = parms
280        .get("Colors")
281        .and_then(PdfValue::as_integer)
282        .unwrap_or(1) as usize;
283    let bits_per_component = parms
284        .get("BitsPerComponent")
285        .and_then(PdfValue::as_integer)
286        .unwrap_or(8) as usize;
287
288    if bits_per_component != 8 {
289        return Err(PdfError::Unsupported(format!(
290            "PNG predictor with BitsPerComponent {bits_per_component} is not supported"
291        )));
292    }
293    if columns == 0 || colors == 0 {
294        return Err(PdfError::Corrupt(
295            "PNG predictor Columns/Colors must be positive".to_string(),
296        ));
297    }
298    let bytes_per_pixel = colors; // bits_per_component == 8
299    let row_data_len = columns * bytes_per_pixel;
300    let row_stride = row_data_len + 1; // leading filter byte
301
302    if data.len() % row_stride != 0 {
303        return Err(PdfError::Corrupt(format!(
304            "PNG predictor row length mismatch: data={} stride={row_stride}",
305            data.len()
306        )));
307    }
308    let row_count = data.len() / row_stride;
309    let mut output = Vec::with_capacity(row_count * row_data_len);
310    let mut prev_row = vec![0u8; row_data_len];
311    let mut row = vec![0u8; row_data_len];
312
313    for r in 0..row_count {
314        let base = r * row_stride;
315        let filter = data[base];
316        let src = &data[base + 1..base + row_stride];
317        row.copy_from_slice(src);
318        match filter {
319            0 => {} // None
320            1 => {
321                // Sub
322                for i in 0..row_data_len {
323                    let left = if i >= bytes_per_pixel {
324                        row[i - bytes_per_pixel]
325                    } else {
326                        0
327                    };
328                    row[i] = row[i].wrapping_add(left);
329                }
330            }
331            2 => {
332                // Up
333                for i in 0..row_data_len {
334                    row[i] = row[i].wrapping_add(prev_row[i]);
335                }
336            }
337            3 => {
338                // Average
339                for i in 0..row_data_len {
340                    let left = if i >= bytes_per_pixel {
341                        row[i - bytes_per_pixel]
342                    } else {
343                        0
344                    };
345                    let up = prev_row[i];
346                    let avg = ((left as u16 + up as u16) / 2) as u8;
347                    row[i] = row[i].wrapping_add(avg);
348                }
349            }
350            4 => {
351                // Paeth
352                for i in 0..row_data_len {
353                    let left = if i >= bytes_per_pixel {
354                        row[i - bytes_per_pixel]
355                    } else {
356                        0
357                    };
358                    let up = prev_row[i];
359                    let up_left = if i >= bytes_per_pixel {
360                        prev_row[i - bytes_per_pixel]
361                    } else {
362                        0
363                    };
364                    row[i] = row[i].wrapping_add(paeth(left, up, up_left));
365                }
366            }
367            other => {
368                return Err(PdfError::Corrupt(format!(
369                    "unknown PNG row filter type {other}"
370                )));
371            }
372        }
373        output.extend_from_slice(&row);
374        prev_row.copy_from_slice(&row);
375    }
376
377    Ok(output)
378}
379
/// Paeth predictor: choose whichever of `a` (left), `b` (above) or `c`
/// (upper-left) lies closest to the estimate `a + b - c`.
///
/// Ties are broken in the order `a`, then `b`, then `c`.
fn paeth(a: u8, b: u8, c: u8) -> u8 {
    let (left, above, diagonal) = (i32::from(a), i32::from(b), i32::from(c));
    let estimate = left + above - diagonal;

    let dist_left = (estimate - left).abs();
    let dist_above = (estimate - above).abs();
    let dist_diagonal = (estimate - diagonal).abs();

    if dist_left <= dist_above && dist_left <= dist_diagonal {
        return a;
    }
    if dist_above <= dist_diagonal {
        b
    } else {
        c
    }
}
393
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{PdfDictionary, PdfStream, PdfValue};
    use flate2::{Compression, write::ZlibEncoder};
    use std::io::Write;

    /// Assemble a stream from a dictionary and its raw (still encoded) bytes.
    fn make_stream(dict: PdfDictionary, data: Vec<u8>) -> PdfStream {
        PdfStream { dict, data }
    }

    /// Zlib-compress `bytes` at the default level using the encoder directly.
    fn zlib(bytes: &[u8]) -> Vec<u8> {
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(bytes).unwrap();
        encoder.finish().unwrap()
    }

    /// Dictionary whose only entry is `/Filter` set to the single name `filter`.
    fn single_filter_dict(filter: &str) -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert("Filter".to_string(), PdfValue::Name(filter.into()));
        dict
    }

    /// FlateDecode stream over `payload` whose `/DecodeParms` carries the
    /// given `/Predictor` and `/Columns` values.
    fn predicted_flate_stream(payload: &[u8], predictor: PdfValue, columns: PdfValue) -> PdfStream {
        let mut parms = PdfDictionary::new();
        parms.insert("Predictor".to_string(), predictor);
        parms.insert("Columns".to_string(), columns);
        let mut dict = single_filter_dict("FlateDecode");
        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
        make_stream(dict, zlib(payload))
    }

    /// Reference ASCII85 encoder used to build test input, `~>` included.
    fn ascii85_encode(bytes: &[u8]) -> String {
        let mut text = String::new();
        for chunk in bytes.chunks(4) {
            let mut word = [0u8; 4];
            word[..chunk.len()].copy_from_slice(chunk);
            let value = u32::from_be_bytes(word);
            if chunk.len() == 4 && value == 0 {
                text.push('z');
                continue;
            }
            // Fill the five digits from least to most significant.
            let mut digits = [b'!'; 5];
            let mut rest = u64::from(value);
            for digit in digits.iter_mut().rev() {
                *digit += (rest % 85) as u8;
                rest /= 85;
            }
            // A short chunk of n bytes emits only its first n + 1 digits.
            text.extend(digits[..chunk.len() + 1].iter().map(|&digit| digit as char));
        }
        text.push_str("~>");
        text
    }

    #[test]
    fn passthrough_when_no_filter() {
        let stream = make_stream(PdfDictionary::new(), vec![1, 2, 3]);
        assert_eq!(decode_stream(&stream).unwrap(), vec![1, 2, 3]);
    }

    #[test]
    fn inflates_flate_decode() {
        let raw = b"hello world";
        let stream = make_stream(single_filter_dict("FlateDecode"), zlib(raw));
        assert_eq!(decode_stream(&stream).unwrap(), raw.to_vec());
    }

    #[test]
    fn applies_png_up_predictor() {
        // Two rows of four bytes each.
        let original: [u8; 8] = [10, 20, 30, 40, 15, 22, 33, 44];
        let (top, bottom) = original.split_at(4);

        // Row 0 uses filter type 0 (None); row 1 uses type 2 (Up), so it
        // stores each byte minus the byte directly above it.
        let mut encoded = vec![0u8];
        encoded.extend_from_slice(top);
        encoded.push(2);
        encoded.extend(
            bottom
                .iter()
                .zip(top.iter())
                .map(|(value, up)| value.wrapping_sub(*up)),
        );

        let stream =
            predicted_flate_stream(&encoded, PdfValue::Integer(12), PdfValue::Integer(4));
        let decoded = decode_stream(&stream).expect("decode");
        assert_eq!(decoded, original.to_vec());
    }

    #[test]
    fn applies_tiff_predictor() {
        // Two rows of four bytes each, 1 color, 8 bits per component.
        let original: [u8; 8] = [10, 20, 30, 40, 15, 22, 33, 44];

        // The TIFF predictor encodes each row on its own: first byte as-is,
        // every later byte as (current - previous). No filter byte prefix.
        let mut encoded = Vec::new();
        for row in original.chunks(4) {
            encoded.push(row[0]);
            encoded.extend(row.windows(2).map(|pair| pair[1].wrapping_sub(pair[0])));
        }

        let stream =
            predicted_flate_stream(&encoded, PdfValue::Integer(2), PdfValue::Integer(4));
        let decoded = decode_stream(&stream).expect("decode");
        assert_eq!(decoded, original.to_vec());
    }

    #[test]
    fn decodes_ascii85_full_group() {
        // Full 4-byte group "Man " → ASCII85 "9jqo^".
        let stream = make_stream(single_filter_dict("ASCII85Decode"), b"9jqo^~>".to_vec());
        assert_eq!(decode_stream(&stream).unwrap(), b"Man ".to_vec());
    }

    #[test]
    fn decodes_ascii85_z_shortcut() {
        let stream = make_stream(single_filter_dict("ASCII85Decode"), b"z~>".to_vec());
        assert_eq!(decode_stream(&stream).unwrap(), vec![0, 0, 0, 0]);
    }

    #[test]
    fn decodes_filter_chain_ascii85_then_flate() {
        // The /Filter array lists filters in DECODE order, so the bytes are
        // produced in the reverse order: deflate the plaintext first, then
        // wrap the compressed bytes in ASCII85.
        let plaintext = b"PdfStreamFilterChainTest".to_vec();
        let wrapped = ascii85_encode(&zlib(&plaintext));

        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfValue::Array(vec![
                PdfValue::Name("ASCII85Decode".into()),
                PdfValue::Name("FlateDecode".into()),
            ]),
        );
        let stream = make_stream(dict, wrapped.into_bytes());
        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
    }

    #[test]
    fn decodes_ascii_hex() {
        let stream = make_stream(
            single_filter_dict("ASCIIHexDecode"),
            b"48656C6C6F>".to_vec(),
        );
        assert_eq!(decode_stream(&stream).unwrap(), b"Hello".to_vec());
    }

    #[test]
    fn rejects_unsupported_predictor() {
        let mut parms = PdfDictionary::new();
        parms.insert("Predictor".to_string(), PdfValue::Integer(3));
        let mut dict = PdfDictionary::new();
        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
        let stream = make_stream(dict, vec![0, 0, 0, 0]);

        let outcome = decode_stream(&stream);
        if let Err(PdfError::Unsupported(msg)) = &outcome {
            assert!(msg.contains("predictor"), "got: {msg}");
        } else {
            panic!("expected Unsupported, got: {outcome:?}");
        }
    }
}