pdf_objects/
stream.rs

1use std::io::{Read, Write};
2
3use flate2::Compression;
4use flate2::read::ZlibDecoder;
5use flate2::write::ZlibEncoder;
6
7use crate::error::{PdfError, PdfResult};
8use crate::types::{PdfStream, PdfValue};
9
10/// Compress `data` with FlateDecode (zlib / deflate) at the default
11/// compression level. Used by the writer when re-emitting rewritten content
12/// streams so the saved PDF does not bloat with plaintext content bytes.
13pub fn flate_encode(data: &[u8]) -> PdfResult<Vec<u8>> {
14    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
15    encoder
16        .write_all(data)
17        .map_err(|error| PdfError::Corrupt(format!("flate encode failed: {error}")))?;
18    encoder
19        .finish()
20        .map_err(|error| PdfError::Corrupt(format!("flate encode finalize failed: {error}")))
21}
22
23pub fn decode_stream(stream: &PdfStream) -> PdfResult<Vec<u8>> {
24    let filter_names = normalize_filter_list(stream.dict.get("Filter"))?;
25    let decode_parms = stream.dict.get("DecodeParms");
26    let mut decoded = stream.data.clone();
27    for (index, filter_name) in filter_names.iter().enumerate() {
28        let is_last = index + 1 == filter_names.len();
29        decoded = match filter_name.as_str() {
30            // LZW needs `DecodeParms /EarlyChange`; every other filter ignores
31            // DecodeParms here because predictors are applied after the chain.
32            "LZWDecode" | "LZW" => {
33                let early_change = if is_last {
34                    lzw_early_change(decode_parms)?
35                } else {
36                    true
37                };
38                lzw_decode(&decoded, early_change)?
39            }
40            _ => apply_filter(filter_name, &decoded)?,
41        };
42    }
43    apply_predictor(&decoded, decode_parms)
44}
45
46/// Return the /Filter entry as an ordered list of filter names, whether
47/// the source dictionary uses the single-name shorthand or the array
48/// form. Empty list means no filters applied (raw data).
49fn normalize_filter_list(value: Option<&PdfValue>) -> PdfResult<Vec<String>> {
50    match value {
51        None => Ok(Vec::new()),
52        Some(PdfValue::Null) => Ok(Vec::new()),
53        Some(PdfValue::Name(name)) => Ok(vec![name.clone()]),
54        Some(PdfValue::Array(items)) => {
55            let mut names = Vec::with_capacity(items.len());
56            for item in items {
57                match item {
58                    PdfValue::Name(name) => names.push(name.clone()),
59                    _ => {
60                        return Err(PdfError::Corrupt(
61                            "stream /Filter array contains a non-name entry".to_string(),
62                        ));
63                    }
64                }
65            }
66            Ok(names)
67        }
68        Some(_) => Err(PdfError::Corrupt(
69            "stream /Filter is neither a name nor an array of names".to_string(),
70        )),
71    }
72}
73
74fn apply_filter(filter: &str, data: &[u8]) -> PdfResult<Vec<u8>> {
75    match filter {
76        "FlateDecode" | "Fl" => inflate(data),
77        "ASCII85Decode" | "A85" => ascii85_decode(data),
78        "ASCIIHexDecode" | "AHx" => ascii_hex_decode(data),
79        "LZWDecode" | "LZW" => lzw_decode(data, true),
80        "RunLengthDecode" | "RL" => run_length_decode(data),
81        other => Err(PdfError::Unsupported(format!(
82            "stream filter /{other} is not supported"
83        ))),
84    }
85}
86
87/// Read the `DecodeParms /EarlyChange` flag for an LZW stream. PDF
88/// defaults to `1` when the entry is missing; `0` disables the
89/// one-code-early width switch that TIFF-flavoured LZW implementations
90/// use. Any other value is rejected as corrupt so we never silently
91/// misalign on an unknown flag.
92fn lzw_early_change(decode_parms: Option<&PdfValue>) -> PdfResult<bool> {
93    let Some(value) = decode_parms else {
94        return Ok(true);
95    };
96    let dict = match value {
97        PdfValue::Dictionary(dict) => dict,
98        PdfValue::Null => return Ok(true),
99        PdfValue::Array(_) => {
100            return Err(PdfError::Unsupported(
101                "per-filter DecodeParms arrays are not supported".to_string(),
102            ));
103        }
104        _ => {
105            return Err(PdfError::Corrupt(
106                "DecodeParms is not a dictionary".to_string(),
107            ));
108        }
109    };
110    match dict.get("EarlyChange").and_then(PdfValue::as_integer) {
111        None => Ok(true),
112        Some(1) => Ok(true),
113        Some(0) => Ok(false),
114        Some(other) => Err(PdfError::Corrupt(format!(
115            "unsupported LZW EarlyChange value {other}"
116        ))),
117    }
118}
119
/// Upper bound on the size of a decoded stream: 256 MiB. Decoders that
/// can expand their input compare their output against this limit and
/// fail instead of growing further, which keeps a decompression bomb from
/// exhausting memory.
const MAX_DECOMPRESSED_SIZE: u64 = 256 << 20;
123
124fn inflate(data: &[u8]) -> PdfResult<Vec<u8>> {
125    let decoder = ZlibDecoder::new(data);
126    let mut output = Vec::new();
127    decoder
128        .take(MAX_DECOMPRESSED_SIZE + 1)
129        .read_to_end(&mut output)
130        .map_err(|error| PdfError::Corrupt(format!("failed to decode flate stream: {error}")))?;
131    if output.len() as u64 > MAX_DECOMPRESSED_SIZE {
132        return Err(PdfError::Corrupt(
133            "decompressed stream exceeds maximum allowed size".to_string(),
134        ));
135    }
136    Ok(output)
137}
138
139/// Decode an LZW-encoded byte run (PDF § 7.4.4). Uses the TIFF-compatible
140/// variable-width code flavour: 9–12-bit codes, 256 = CLEAR, 257 = EOD,
141/// the literal dictionary seeds indices 0–255, and growth starts at 258.
142/// `early_change` mirrors `DecodeParms /EarlyChange` — `true` (the PDF
143/// default) switches to a wider code one entry before the dictionary is
144/// fully populated at the current width.
145fn lzw_decode(data: &[u8], early_change: bool) -> PdfResult<Vec<u8>> {
146    const CLEAR: u32 = 256;
147    const EOD: u32 = 257;
148    const MAX_WIDTH: u32 = 12;
149    let width_threshold = |width: u32| {
150        if early_change {
151            (1u32 << width) - 1
152        } else {
153            1u32 << width
154        }
155    };
156
157    let mut reader = BitReader::new(data);
158    let mut dict: Vec<Vec<u8>> = Vec::with_capacity(1 << MAX_WIDTH);
159    let reset_dict = |dict: &mut Vec<Vec<u8>>| {
160        dict.clear();
161        for byte in 0u32..256 {
162            dict.push(vec![byte as u8]);
163        }
164        dict.push(Vec::new()); // 256 — placeholder for CLEAR
165        dict.push(Vec::new()); // 257 — placeholder for EOD
166    };
167    reset_dict(&mut dict);
168
169    let mut output: Vec<u8> = Vec::new();
170    let mut code_width: u32 = 9;
171    let mut previous: Option<Vec<u8>> = None;
172    loop {
173        let Some(code) = reader.read_bits(code_width) else {
174            break;
175        };
176        if code == EOD {
177            break;
178        }
179        if code == CLEAR {
180            reset_dict(&mut dict);
181            code_width = 9;
182            previous = None;
183            continue;
184        }
185        let entry = if (code as usize) < dict.len() {
186            let entry = dict[code as usize].clone();
187            if entry.is_empty() {
188                return Err(PdfError::Corrupt(format!(
189                    "LZW code {code} references placeholder entry"
190                )));
191            }
192            entry
193        } else if code as usize == dict.len() {
194            // Standard LZW K+K[0] special case: the code points at the
195            // entry we are about to add, so reconstruct it from the
196            // previous entry plus its own first byte.
197            let prev = previous
198                .clone()
199                .ok_or_else(|| PdfError::Corrupt("LZW code out of sequence".to_string()))?;
200            let first = *prev
201                .first()
202                .ok_or_else(|| PdfError::Corrupt("LZW previous entry was empty".to_string()))?;
203            let mut entry = prev;
204            entry.push(first);
205            entry
206        } else {
207            return Err(PdfError::Corrupt(format!(
208                "LZW code {code} outside dictionary"
209            )));
210        };
211        if output.len() + entry.len() > MAX_DECOMPRESSED_SIZE as usize {
212            return Err(PdfError::Corrupt(
213                "decompressed stream exceeds maximum allowed size".to_string(),
214            ));
215        }
216        output.extend_from_slice(&entry);
217        if let Some(prev_entry) = previous.take() {
218            let mut new_entry = prev_entry;
219            new_entry.push(entry[0]);
220            if dict.len() < (1 << MAX_WIDTH) {
221                dict.push(new_entry);
222            }
223            // The encoder bumps width against `next_code` (the index of the
224            // slot just filled, i.e. `dict.len()` here) AFTER the insert.
225            // The decoder trails the encoder by one dictionary entry — the
226            // push for code N happens while processing code N+1 — so the
227            // decoder has to compare against `dict.len() + 1` to bump width
228            // at the same boundary the encoder did.
229            if (dict.len() as u32).saturating_add(1) >= width_threshold(code_width)
230                && code_width < MAX_WIDTH
231            {
232                code_width += 1;
233            }
234        }
235        previous = Some(entry);
236    }
237    Ok(output)
238}
239
/// MSB-first bit reader over a byte slice, used by the LZW decoder.
///
/// Bytes are pulled from the front of `data` into `bit_buffer`, and codes
/// are taken from its high end, so the first bit of the stream is the
/// most significant bit of the first code. If the input ends part-way
/// through a code, the bits already buffered are extended with zero bits
/// to the requested width and returned as one last code.
struct BitReader<'a> {
    /// Backing bytes.
    data: &'a [u8],
    /// Index of the next byte of `data` to load.
    byte_index: usize,
    /// Bits loaded but not yet handed out, right-aligned.
    bit_buffer: u32,
    /// Number of valid bits in `bit_buffer`.
    bit_count: u32,
}

impl<'a> BitReader<'a> {
    /// Create a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_index: 0,
            bit_buffer: 0,
            bit_count: 0,
        }
    }

    /// Read the next `width`-bit code.
    ///
    /// Returns `None` only when no bits at all are left; a final partial
    /// code is zero-padded on the right and returned as `Some`.
    fn read_bits(&mut self, width: u32) -> Option<u32> {
        while self.bit_count < width {
            match self.data.get(self.byte_index) {
                Some(&next) => {
                    self.bit_buffer = (self.bit_buffer << 8) | u32::from(next);
                    self.byte_index += 1;
                    self.bit_count += 8;
                }
                None if self.bit_count == 0 => return None,
                None => {
                    // Input exhausted mid-code: flush what is buffered,
                    // shifted up so the missing low bits read as zero.
                    let missing = width - self.bit_count;
                    let code = (self.bit_buffer << missing) & ((1u32 << width) - 1);
                    self.bit_buffer = 0;
                    self.bit_count = 0;
                    return Some(code);
                }
            }
        }

        self.bit_count -= width;
        let code = (self.bit_buffer >> self.bit_count) & ((1u32 << width) - 1);
        // Keep only the bits that have not been handed out yet.
        self.bit_buffer &= (1u32 << self.bit_count) - 1;
        Some(code)
    }
}
289
290/// Decode an ASCII85-encoded byte run (PDF § 7.4.3). Whitespace is
291/// ignored, `z` expands to four zero bytes, and `~>` terminates the
292/// stream; a short final group is padded with `u` and the decoded
293/// tail is truncated accordingly.
294fn ascii85_decode(data: &[u8]) -> PdfResult<Vec<u8>> {
295    let mut output = Vec::with_capacity(data.len());
296    let mut group = [0u8; 5];
297    let mut group_len = 0usize;
298
299    for &byte in data {
300        if byte == b'~' {
301            break; // `~>` EOD marker; the `>` is allowed to follow or be absent.
302        }
303        if matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C) {
304            continue;
305        }
306        if byte == b'z' {
307            if group_len != 0 {
308                return Err(PdfError::Corrupt(
309                    "ASCII85 'z' shortcut inside a partial group".to_string(),
310                ));
311            }
312            output.extend_from_slice(&[0u8; 4]);
313            continue;
314        }
315        if !(b'!'..=b'u').contains(&byte) {
316            return Err(PdfError::Corrupt(format!(
317                "invalid ASCII85 byte 0x{byte:02X}"
318            )));
319        }
320        group[group_len] = byte - b'!';
321        group_len += 1;
322        if group_len == 5 {
323            let value = (group[0] as u64) * 85u64.pow(4)
324                + (group[1] as u64) * 85u64.pow(3)
325                + (group[2] as u64) * 85u64.pow(2)
326                + (group[3] as u64) * 85
327                + (group[4] as u64);
328            if value > u32::MAX as u64 {
329                return Err(PdfError::Corrupt(
330                    "ASCII85 group value exceeds 32 bits".to_string(),
331                ));
332            }
333            output.extend_from_slice(&(value as u32).to_be_bytes());
334            group_len = 0;
335        }
336    }
337
338    if group_len > 0 {
339        if group_len == 1 {
340            return Err(PdfError::Corrupt(
341                "ASCII85 final group contains a single byte".to_string(),
342            ));
343        }
344        // Pad with the max digit so truncating yields the right tail.
345        for entry in group.iter_mut().skip(group_len) {
346            *entry = 84;
347        }
348        let value = (group[0] as u64) * 85u64.pow(4)
349            + (group[1] as u64) * 85u64.pow(3)
350            + (group[2] as u64) * 85u64.pow(2)
351            + (group[3] as u64) * 85
352            + (group[4] as u64);
353        let bytes = (value as u32).to_be_bytes();
354        output.extend_from_slice(&bytes[..group_len - 1]);
355    }
356
357    Ok(output)
358}
359
360/// Decode an ASCIIHex-encoded byte run (PDF § 7.4.2). Whitespace is
361/// ignored, `>` terminates the stream, and a trailing odd nibble is
362/// treated as if followed by `0`.
363fn ascii_hex_decode(data: &[u8]) -> PdfResult<Vec<u8>> {
364    let mut output = Vec::with_capacity(data.len() / 2 + 1);
365    let mut high: Option<u8> = None;
366    for &byte in data {
367        if byte == b'>' {
368            break;
369        }
370        if matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C) {
371            continue;
372        }
373        let nibble = match byte {
374            b'0'..=b'9' => byte - b'0',
375            b'a'..=b'f' => byte - b'a' + 10,
376            b'A'..=b'F' => byte - b'A' + 10,
377            _ => {
378                return Err(PdfError::Corrupt(format!(
379                    "invalid ASCIIHex byte 0x{byte:02X}"
380                )));
381            }
382        };
383        match high.take() {
384            None => high = Some(nibble),
385            Some(h) => output.push((h << 4) | nibble),
386        }
387    }
388    if let Some(h) = high {
389        output.push(h << 4);
390    }
391    Ok(output)
392}
393
394/// Decode a RunLengthDecode byte run (PDF § 7.4.5). Each control byte `L`
395/// either introduces a literal run (`0..=127` → copy `L+1` bytes), a
396/// repeated byte (`129..=255` → repeat the next byte `257-L` times), or
397/// the end-of-data marker (`128`). A stream that ends before the EOD
398/// marker is accepted — some producers omit it — but truncated literal
399/// or repeat runs are treated as corruption.
400fn run_length_decode(data: &[u8]) -> PdfResult<Vec<u8>> {
401    let mut output: Vec<u8> = Vec::with_capacity(data.len());
402    let mut index = 0usize;
403    while index < data.len() {
404        let length_byte = data[index];
405        index += 1;
406        if length_byte == 128 {
407            return Ok(output);
408        }
409        if length_byte < 128 {
410            let run_len = usize::from(length_byte) + 1;
411            let end = index
412                .checked_add(run_len)
413                .ok_or_else(|| PdfError::Corrupt("RunLengthDecode index overflow".to_string()))?;
414            if end > data.len() {
415                return Err(PdfError::Corrupt(
416                    "RunLengthDecode literal run runs past end of stream".to_string(),
417                ));
418            }
419            output.extend_from_slice(&data[index..end]);
420            index = end;
421        } else {
422            let repeat = 257usize - usize::from(length_byte);
423            if index >= data.len() {
424                return Err(PdfError::Corrupt(
425                    "RunLengthDecode repeat run is missing its payload byte".to_string(),
426                ));
427            }
428            let byte = data[index];
429            index += 1;
430            output.extend(std::iter::repeat_n(byte, repeat));
431        }
432        if output.len() as u64 > MAX_DECOMPRESSED_SIZE {
433            return Err(PdfError::Corrupt(
434                "decompressed stream exceeds maximum allowed size".to_string(),
435            ));
436        }
437    }
438    Ok(output)
439}
440
441fn apply_predictor(data: &[u8], decode_parms: Option<&PdfValue>) -> PdfResult<Vec<u8>> {
442    let parms = match decode_parms {
443        None => return Ok(data.to_vec()),
444        Some(PdfValue::Dictionary(dict)) => dict,
445        Some(PdfValue::Null) => return Ok(data.to_vec()),
446        Some(PdfValue::Array(_)) => {
447            // Per-filter DecodeParms arrays are legal when multiple filters are
448            // chained. We only support a single FlateDecode filter today so any
449            // array-valued DecodeParms is unexpected.
450            return Err(PdfError::Unsupported(
451                "per-filter DecodeParms arrays are not supported".to_string(),
452            ));
453        }
454        Some(_) => {
455            return Err(PdfError::Corrupt(
456                "DecodeParms is not a dictionary".to_string(),
457            ));
458        }
459    };
460
461    let predictor = parms
462        .get("Predictor")
463        .and_then(PdfValue::as_integer)
464        .unwrap_or(1);
465    match predictor {
466        1 => Ok(data.to_vec()),
467        2 => tiff_predictor_decode(data, parms),
468        10..=15 => png_predictor_decode(data, parms),
469        other => Err(PdfError::Unsupported(format!(
470            "predictor {other} is not supported"
471        ))),
472    }
473}
474
475fn tiff_predictor_decode(data: &[u8], parms: &crate::types::PdfDictionary) -> PdfResult<Vec<u8>> {
476    let columns = parms
477        .get("Columns")
478        .and_then(PdfValue::as_integer)
479        .unwrap_or(1) as usize;
480    let colors = parms
481        .get("Colors")
482        .and_then(PdfValue::as_integer)
483        .unwrap_or(1) as usize;
484    let bits_per_component = parms
485        .get("BitsPerComponent")
486        .and_then(PdfValue::as_integer)
487        .unwrap_or(8) as usize;
488
489    if bits_per_component != 8 {
490        return Err(PdfError::Unsupported(format!(
491            "TIFF predictor with BitsPerComponent {bits_per_component} is not supported"
492        )));
493    }
494    if columns == 0 || colors == 0 {
495        return Err(PdfError::Corrupt(
496            "TIFF predictor Columns/Colors must be positive".to_string(),
497        ));
498    }
499    let row_stride = columns * colors;
500    if data.len() % row_stride != 0 {
501        return Err(PdfError::Corrupt(format!(
502            "TIFF predictor row length mismatch: data={} stride={row_stride}",
503            data.len()
504        )));
505    }
506    let mut output = Vec::with_capacity(data.len());
507    for row in data.chunks_exact(row_stride) {
508        for (component_index, byte) in row.iter().enumerate() {
509            if component_index < colors {
510                // First pixel in a row is stored as-is per component.
511                output.push(*byte);
512            } else {
513                let previous = output[output.len() - colors];
514                output.push(previous.wrapping_add(*byte));
515            }
516        }
517    }
518    Ok(output)
519}
520
521fn png_predictor_decode(data: &[u8], parms: &crate::types::PdfDictionary) -> PdfResult<Vec<u8>> {
522    let columns = parms
523        .get("Columns")
524        .and_then(PdfValue::as_integer)
525        .unwrap_or(1) as usize;
526    let colors = parms
527        .get("Colors")
528        .and_then(PdfValue::as_integer)
529        .unwrap_or(1) as usize;
530    let bits_per_component = parms
531        .get("BitsPerComponent")
532        .and_then(PdfValue::as_integer)
533        .unwrap_or(8) as usize;
534
535    if bits_per_component != 8 {
536        return Err(PdfError::Unsupported(format!(
537            "PNG predictor with BitsPerComponent {bits_per_component} is not supported"
538        )));
539    }
540    if columns == 0 || colors == 0 {
541        return Err(PdfError::Corrupt(
542            "PNG predictor Columns/Colors must be positive".to_string(),
543        ));
544    }
545    let bytes_per_pixel = colors; // bits_per_component == 8
546    let row_data_len = columns * bytes_per_pixel;
547    let row_stride = row_data_len + 1; // leading filter byte
548
549    if data.len() % row_stride != 0 {
550        return Err(PdfError::Corrupt(format!(
551            "PNG predictor row length mismatch: data={} stride={row_stride}",
552            data.len()
553        )));
554    }
555    let row_count = data.len() / row_stride;
556    let mut output = Vec::with_capacity(row_count * row_data_len);
557    let mut prev_row = vec![0u8; row_data_len];
558    let mut row = vec![0u8; row_data_len];
559
560    for r in 0..row_count {
561        let base = r * row_stride;
562        let filter = data[base];
563        let src = &data[base + 1..base + row_stride];
564        row.copy_from_slice(src);
565        match filter {
566            0 => {} // None
567            1 => {
568                // Sub
569                for i in 0..row_data_len {
570                    let left = if i >= bytes_per_pixel {
571                        row[i - bytes_per_pixel]
572                    } else {
573                        0
574                    };
575                    row[i] = row[i].wrapping_add(left);
576                }
577            }
578            2 => {
579                // Up
580                for i in 0..row_data_len {
581                    row[i] = row[i].wrapping_add(prev_row[i]);
582                }
583            }
584            3 => {
585                // Average
586                for i in 0..row_data_len {
587                    let left = if i >= bytes_per_pixel {
588                        row[i - bytes_per_pixel]
589                    } else {
590                        0
591                    };
592                    let up = prev_row[i];
593                    let avg = ((left as u16 + up as u16) / 2) as u8;
594                    row[i] = row[i].wrapping_add(avg);
595                }
596            }
597            4 => {
598                // Paeth
599                for i in 0..row_data_len {
600                    let left = if i >= bytes_per_pixel {
601                        row[i - bytes_per_pixel]
602                    } else {
603                        0
604                    };
605                    let up = prev_row[i];
606                    let up_left = if i >= bytes_per_pixel {
607                        prev_row[i - bytes_per_pixel]
608                    } else {
609                        0
610                    };
611                    row[i] = row[i].wrapping_add(paeth(left, up, up_left));
612                }
613            }
614            other => {
615                return Err(PdfError::Corrupt(format!(
616                    "unknown PNG row filter type {other}"
617                )));
618            }
619        }
620        output.extend_from_slice(&row);
621        prev_row.copy_from_slice(&row);
622    }
623
624    Ok(output)
625}
626
/// Paeth predictor: of `a`, `b` and `c`, return the one closest to
/// `a + b - c`. Ties are resolved in the order `a`, then `b`, then `c`.
fn paeth(a: u8, b: u8, c: u8) -> u8 {
    let (first, second, third) = (i32::from(a), i32::from(b), i32::from(c));
    let estimate = first + second - third;

    let dist_a = (estimate - first).abs();
    let dist_b = (estimate - second).abs();
    let dist_c = (estimate - third).abs();

    if dist_a <= dist_b && dist_a <= dist_c {
        a
    } else if dist_b <= dist_c {
        b
    } else {
        c
    }
}
640
641#[cfg(test)]
642mod tests {
643    use super::*;
644    use crate::types::{PdfDictionary, PdfStream, PdfValue};
645    use flate2::{Compression, write::ZlibEncoder};
646    use std::io::Write;
647
648    fn make_stream(dict: PdfDictionary, data: Vec<u8>) -> PdfStream {
649        PdfStream { dict, data }
650    }
651
652    #[test]
653    fn passthrough_when_no_filter() {
654        let dict = PdfDictionary::new();
655        let stream = make_stream(dict, vec![1, 2, 3]);
656        assert_eq!(decode_stream(&stream).unwrap(), vec![1, 2, 3]);
657    }
658
659    #[test]
660    fn inflates_flate_decode() {
661        let raw = b"hello world";
662        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
663        encoder.write_all(raw).unwrap();
664        let compressed = encoder.finish().unwrap();
665        let mut dict = PdfDictionary::new();
666        dict.insert("Filter".to_string(), PdfValue::Name("FlateDecode".into()));
667        let stream = make_stream(dict, compressed);
668        assert_eq!(decode_stream(&stream).unwrap(), raw.to_vec());
669    }
670
671    #[test]
672    fn applies_png_up_predictor() {
673        // Original 2 rows of 4 bytes each.
674        let original: [u8; 8] = [10, 20, 30, 40, 15, 22, 33, 44];
675
676        // Encode with filter type 2 (Up) on row 2, type 0 on row 1.
677        let mut encoded = Vec::new();
678        encoded.push(0); // row 0: None
679        encoded.extend_from_slice(&original[0..4]);
680        encoded.push(2); // row 1: Up
681        let diff: Vec<u8> = original[4..8]
682            .iter()
683            .zip(original[0..4].iter())
684            .map(|(v, up)| v.wrapping_sub(*up))
685            .collect();
686        encoded.extend_from_slice(&diff);
687
688        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
689        encoder.write_all(&encoded).unwrap();
690        let compressed = encoder.finish().unwrap();
691
692        let mut dict = PdfDictionary::new();
693        dict.insert("Filter".to_string(), PdfValue::Name("FlateDecode".into()));
694        let mut parms = PdfDictionary::new();
695        parms.insert("Predictor".to_string(), PdfValue::Integer(12));
696        parms.insert("Columns".to_string(), PdfValue::Integer(4));
697        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
698
699        let stream = make_stream(dict, compressed);
700        let decoded = decode_stream(&stream).expect("decode");
701        assert_eq!(decoded, original.to_vec());
702    }
703
704    #[test]
705    fn applies_tiff_predictor() {
706        // Original 2 rows of 4 bytes each, 1 color, 8 bits per component.
707        let original: [u8; 8] = [10, 20, 30, 40, 15, 22, 33, 44];
708
709        // TIFF predictor encodes each row independently: first byte as-is,
710        // subsequent bytes as (current - previous). No filter byte prefix.
711        let mut encoded = Vec::new();
712        for row in original.chunks(4) {
713            encoded.push(row[0]);
714            for index in 1..row.len() {
715                encoded.push(row[index].wrapping_sub(row[index - 1]));
716            }
717        }
718
719        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
720        encoder.write_all(&encoded).unwrap();
721        let compressed = encoder.finish().unwrap();
722
723        let mut dict = PdfDictionary::new();
724        dict.insert("Filter".to_string(), PdfValue::Name("FlateDecode".into()));
725        let mut parms = PdfDictionary::new();
726        parms.insert("Predictor".to_string(), PdfValue::Integer(2));
727        parms.insert("Columns".to_string(), PdfValue::Integer(4));
728        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
729
730        let stream = make_stream(dict, compressed);
731        let decoded = decode_stream(&stream).expect("decode");
732        assert_eq!(decoded, original.to_vec());
733    }
734
735    #[test]
736    fn decodes_ascii85_full_group() {
737        // Full 4-byte group "Man " → ASCII85 "9jqo^".
738        let encoded = b"9jqo^~>".to_vec();
739        let mut dict = PdfDictionary::new();
740        dict.insert("Filter".to_string(), PdfValue::Name("ASCII85Decode".into()));
741        let stream = make_stream(dict, encoded);
742        assert_eq!(decode_stream(&stream).unwrap(), b"Man ".to_vec());
743    }
744
745    #[test]
746    fn decodes_ascii85_z_shortcut() {
747        let encoded = b"z~>".to_vec();
748        let mut dict = PdfDictionary::new();
749        dict.insert("Filter".to_string(), PdfValue::Name("ASCII85Decode".into()));
750        let stream = make_stream(dict, encoded);
751        assert_eq!(decode_stream(&stream).unwrap(), vec![0, 0, 0, 0]);
752    }
753
754    #[test]
755    fn decodes_filter_chain_ascii85_then_flate() {
756        // Encode plaintext with FlateDecode first, then ASCII85 wrap. The
757        // order the filter list uses is the DECODE order, so reading the
758        // stream applies ASCII85 first and FlateDecode second — the same
759        // order we use to produce the bytes in reverse.
760        let plaintext = b"PdfStreamFilterChainTest".to_vec();
761        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
762        encoder.write_all(&plaintext).unwrap();
763        let flate_bytes = encoder.finish().unwrap();
764
765        // ASCII85 encode the FlateDecode payload.
766        let mut ascii85 = String::new();
767        for chunk in flate_bytes.chunks(4) {
768            let mut buf = [0u8; 4];
769            buf[..chunk.len()].copy_from_slice(chunk);
770            let value = u32::from_be_bytes(buf);
771            if chunk.len() == 4 && value == 0 {
772                ascii85.push('z');
773                continue;
774            }
775            let mut digits = [0u8; 5];
776            let mut v = value as u64;
777            for i in (0..5).rev() {
778                digits[i] = (v % 85) as u8 + b'!';
779                v /= 85;
780            }
781            let take = chunk.len() + 1;
782            for &digit in &digits[..take] {
783                ascii85.push(digit as char);
784            }
785        }
786        ascii85.push_str("~>");
787
788        let mut dict = PdfDictionary::new();
789        dict.insert(
790            "Filter".to_string(),
791            PdfValue::Array(vec![
792                PdfValue::Name("ASCII85Decode".into()),
793                PdfValue::Name("FlateDecode".into()),
794            ]),
795        );
796        let stream = make_stream(dict, ascii85.into_bytes());
797        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
798    }
799
800    #[test]
801    fn decodes_ascii_hex() {
802        let encoded = b"48656C6C6F>".to_vec();
803        let mut dict = PdfDictionary::new();
804        dict.insert(
805            "Filter".to_string(),
806            PdfValue::Name("ASCIIHexDecode".into()),
807        );
808        let stream = make_stream(dict, encoded);
809        assert_eq!(decode_stream(&stream).unwrap(), b"Hello".to_vec());
810    }
811
812    #[test]
813    fn rejects_unsupported_predictor() {
814        let mut dict = PdfDictionary::new();
815        let mut parms = PdfDictionary::new();
816        parms.insert("Predictor".to_string(), PdfValue::Integer(3));
817        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
818        let stream = make_stream(dict, vec![0, 0, 0, 0]);
819        match decode_stream(&stream) {
820            Err(PdfError::Unsupported(msg)) => {
821                assert!(msg.contains("predictor"), "got: {msg}")
822            }
823            other => panic!("expected Unsupported, got: {other:?}"),
824        }
825    }
826
827    /// Encode `input` with the TIFF-compatible LZW variant used by PDF
828    /// (9–12-bit codes, 256 = CLEAR, 257 = EOD). `early_change` mirrors
829    /// `DecodeParms /EarlyChange`: `true` switches width one code earlier.
830    fn encode_lzw(input: &[u8], early_change: bool) -> Vec<u8> {
831        use std::collections::HashMap;
832
833        let mut out: Vec<u8> = Vec::new();
834        let mut bit_buffer: u64 = 0;
835        let mut bit_count: u32 = 0;
836        let flush_code = |code: u32,
837                          width: u32,
838                          bit_buffer: &mut u64,
839                          bit_count: &mut u32,
840                          out: &mut Vec<u8>| {
841            *bit_buffer = (*bit_buffer << width) | u64::from(code);
842            *bit_count += width;
843            while *bit_count >= 8 {
844                *bit_count -= 8;
845                out.push(((*bit_buffer >> *bit_count) & 0xFF) as u8);
846                *bit_buffer &= (1u64 << *bit_count) - 1;
847            }
848        };
849
850        // Start every stream with CLEAR.
851        flush_code(256, 9, &mut bit_buffer, &mut bit_count, &mut out);
852
853        let mut dict: HashMap<Vec<u8>, u32> = HashMap::new();
854        for b in 0u32..256 {
855            dict.insert(vec![b as u8], b);
856        }
857        let mut next_code: u32 = 258;
858        let mut code_width: u32 = 9;
859
860        let mut buffer: Vec<u8> = Vec::new();
861        for &byte in input {
862            let mut extended = buffer.clone();
863            extended.push(byte);
864            if dict.contains_key(&extended) {
865                buffer = extended;
866            } else {
867                let code = dict[&buffer];
868                flush_code(code, code_width, &mut bit_buffer, &mut bit_count, &mut out);
869                dict.insert(extended, next_code);
870                next_code += 1;
871                let threshold = if early_change {
872                    (1u32 << code_width) - 1
873                } else {
874                    1u32 << code_width
875                };
876                if next_code >= threshold && code_width < 12 {
877                    code_width += 1;
878                }
879                buffer = vec![byte];
880            }
881        }
882        if !buffer.is_empty() {
883            let code = dict[&buffer];
884            flush_code(code, code_width, &mut bit_buffer, &mut bit_count, &mut out);
885        }
886        flush_code(257, code_width, &mut bit_buffer, &mut bit_count, &mut out);
887        if bit_count > 0 {
888            out.push(((bit_buffer << (8 - bit_count)) & 0xFF) as u8);
889        }
890        out
891    }
892
893    #[test]
894    fn decodes_lzw_spec_example() {
895        // PDF 1.7 spec § 7.4.4.3, Annex A.3: "-----A---B" encodes to the
896        // 8 nine-bit codes 256, 45, 258, 258, 65, 259, 66, 257, which pack
897        // MSB-first into these nine bytes.
898        let data = vec![0x80, 0x0B, 0x60, 0x50, 0x22, 0x0C, 0x0C, 0x85, 0x01];
899        let mut dict = PdfDictionary::new();
900        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
901        let stream = make_stream(dict, data);
902        assert_eq!(decode_stream(&stream).unwrap(), b"-----A---B".to_vec());
903    }
904
905    #[test]
906    fn decodes_lzw_roundtrip_default_early_change() {
907        let plaintext = b"the quick brown fox jumps over the lazy dog".to_vec();
908        let encoded = encode_lzw(&plaintext, true);
909        let mut dict = PdfDictionary::new();
910        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
911        let stream = make_stream(dict, encoded);
912        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
913    }
914
915    #[test]
916    fn decodes_lzw_roundtrip_early_change_zero() {
917        let plaintext = b"the quick brown fox jumps over the lazy dog".to_vec();
918        let encoded = encode_lzw(&plaintext, false);
919        let mut dict = PdfDictionary::new();
920        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
921        let mut parms = PdfDictionary::new();
922        parms.insert("EarlyChange".to_string(), PdfValue::Integer(0));
923        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
924        let stream = make_stream(dict, encoded);
925        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
926    }
927
928    #[test]
929    fn decodes_lzw_with_tiff_predictor() {
930        // Original 2 rows × 4 bytes, 1 colour, 8 bits per component.
931        // TIFF predictor leaves the first byte per row as-is and stores
932        // the rest as delta from the previous byte. The LZW filter sits
933        // on top: it compresses the predictor-encoded bytes.
934        let original: [u8; 8] = [10, 20, 30, 40, 15, 22, 33, 44];
935        let mut predictor_encoded = Vec::new();
936        for row in original.chunks(4) {
937            predictor_encoded.push(row[0]);
938            for index in 1..row.len() {
939                predictor_encoded.push(row[index].wrapping_sub(row[index - 1]));
940            }
941        }
942        let lzw_bytes = encode_lzw(&predictor_encoded, true);
943        let mut dict = PdfDictionary::new();
944        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
945        let mut parms = PdfDictionary::new();
946        parms.insert("Predictor".to_string(), PdfValue::Integer(2));
947        parms.insert("Columns".to_string(), PdfValue::Integer(4));
948        dict.insert("DecodeParms".to_string(), PdfValue::Dictionary(parms));
949        let stream = make_stream(dict, lzw_bytes);
950        assert_eq!(decode_stream(&stream).unwrap(), original.to_vec());
951    }
952
953    #[test]
954    fn decodes_lzw_exercises_code_width_transitions() {
955        // Build an input long enough to force the dictionary past 511
956        // entries so the decoder exercises the 9→10 and 10→11 bit width
957        // transitions. ~1200 unique trigrams from a pangram-ish repeat
958        // suffices.
959        let mut plaintext = Vec::new();
960        for i in 0u16..1200 {
961            plaintext.push(b'a' + (i % 26) as u8);
962            plaintext.push(b'A' + (i % 26) as u8);
963            plaintext.push(b'0' + (i % 10) as u8);
964        }
965        let encoded = encode_lzw(&plaintext, true);
966        let mut dict = PdfDictionary::new();
967        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
968        let stream = make_stream(dict, encoded);
969        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
970    }
971
972    #[test]
973    fn decodes_run_length_literal_runs() {
974        // Length byte 2 means "copy next 3 bytes literally". EOD = 128.
975        let encoded = vec![2, b'A', b'B', b'C', 128];
976        let mut dict = PdfDictionary::new();
977        dict.insert(
978            "Filter".to_string(),
979            PdfValue::Name("RunLengthDecode".into()),
980        );
981        let stream = make_stream(dict, encoded);
982        assert_eq!(decode_stream(&stream).unwrap(), b"ABC".to_vec());
983    }
984
985    #[test]
986    fn decodes_run_length_repeat_runs() {
987        // Length byte 0xFF (255) means "repeat next byte (257-255)=2 times".
988        let encoded = vec![0xFF, b'Z', 128];
989        let mut dict = PdfDictionary::new();
990        dict.insert("Filter".to_string(), PdfValue::Name("RL".into()));
991        let stream = make_stream(dict, encoded);
992        assert_eq!(decode_stream(&stream).unwrap(), b"ZZ".to_vec());
993    }
994
995    #[test]
996    fn decodes_run_length_mixed_runs_without_eod() {
997        // "ABBBCD" packed as literal A, repeat B x3, literal CD. No trailing
998        // EOD byte — some producers omit it and we treat that as end of
999        // stream rather than corruption.
1000        let encoded = vec![0, b'A', 0xFE, b'B', 1, b'C', b'D'];
1001        let mut dict = PdfDictionary::new();
1002        dict.insert(
1003            "Filter".to_string(),
1004            PdfValue::Name("RunLengthDecode".into()),
1005        );
1006        let stream = make_stream(dict, encoded);
1007        assert_eq!(decode_stream(&stream).unwrap(), b"ABBBCD".to_vec());
1008    }
1009
1010    #[test]
1011    fn decodes_filter_chain_run_length_then_flate() {
1012        // Flate-compress a plaintext, then wrap the Flate bytes as a pure
1013        // literal RunLengthDecode stream. Decoding has to run RL first and
1014        // then Flate on the recovered payload — guarding the dispatch
1015        // order inside `decode_stream`.
1016        let plaintext = b"RunLengthInsideAFilterChain".to_vec();
1017        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
1018        encoder.write_all(&plaintext).unwrap();
1019        let flate_bytes = encoder.finish().unwrap();
1020
1021        // Encode `flate_bytes` as RunLengthDecode using only literal runs
1022        // of up to 128 bytes followed by the 128 EOD marker.
1023        let mut rl_bytes = Vec::new();
1024        let mut offset = 0usize;
1025        while offset < flate_bytes.len() {
1026            let run_len = (flate_bytes.len() - offset).min(128);
1027            rl_bytes.push((run_len - 1) as u8);
1028            rl_bytes.extend_from_slice(&flate_bytes[offset..offset + run_len]);
1029            offset += run_len;
1030        }
1031        rl_bytes.push(128);
1032
1033        let mut dict = PdfDictionary::new();
1034        dict.insert(
1035            "Filter".to_string(),
1036            PdfValue::Array(vec![
1037                PdfValue::Name("RunLengthDecode".into()),
1038                PdfValue::Name("FlateDecode".into()),
1039            ]),
1040        );
1041        let stream = make_stream(dict, rl_bytes);
1042        assert_eq!(decode_stream(&stream).unwrap(), plaintext);
1043    }
1044
1045    #[test]
1046    fn rejects_run_length_truncated_literal_run() {
1047        // Length byte 3 claims 4 bytes of literal but only 2 follow.
1048        let encoded = vec![3, b'A', b'B'];
1049        let mut dict = PdfDictionary::new();
1050        dict.insert(
1051            "Filter".to_string(),
1052            PdfValue::Name("RunLengthDecode".into()),
1053        );
1054        let stream = make_stream(dict, encoded);
1055        let err = decode_stream(&stream).unwrap_err();
1056        assert!(matches!(err, PdfError::Corrupt(_)), "got: {err:?}");
1057    }
1058
1059    #[test]
1060    fn rejects_run_length_truncated_repeat_run() {
1061        // Length byte 200 implies a repeat with a payload byte, but the
1062        // payload is missing (stream ends immediately after the length).
1063        let encoded = vec![200];
1064        let mut dict = PdfDictionary::new();
1065        dict.insert(
1066            "Filter".to_string(),
1067            PdfValue::Name("RunLengthDecode".into()),
1068        );
1069        let stream = make_stream(dict, encoded);
1070        let err = decode_stream(&stream).unwrap_err();
1071        assert!(matches!(err, PdfError::Corrupt(_)), "got: {err:?}");
1072    }
1073
1074    #[test]
1075    fn rejects_lzw_out_of_range_code() {
1076        // Single 9-bit code 0x1FF (= 511) after a CLEAR is outside the
1077        // still-256-entry dictionary and not equal to `next_code` yet,
1078        // so the decoder must refuse rather than silently emit.
1079        let mut out: Vec<u8> = Vec::new();
1080        let mut bit_buffer: u64 = 0;
1081        let mut bit_count: u32 = 0;
1082        let mut push = |code: u32, width: u32| {
1083            bit_buffer = (bit_buffer << width) | u64::from(code);
1084            bit_count += width;
1085            while bit_count >= 8 {
1086                bit_count -= 8;
1087                out.push(((bit_buffer >> bit_count) & 0xFF) as u8);
1088                bit_buffer &= (1u64 << bit_count) - 1;
1089            }
1090        };
1091        push(256, 9); // CLEAR
1092        push(511, 9); // invalid
1093        if bit_count > 0 {
1094            out.push(((bit_buffer << (8 - bit_count)) & 0xFF) as u8);
1095        }
1096        let mut dict = PdfDictionary::new();
1097        dict.insert("Filter".to_string(), PdfValue::Name("LZWDecode".into()));
1098        let stream = make_stream(dict, out);
1099        let err = decode_stream(&stream).unwrap_err();
1100        assert!(matches!(err, PdfError::Corrupt(_)), "got: {err:?}");
1101    }
1102}
pdf_objects/stream.rs

pdf_objects/
stream.rs